In [ ]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import joblib
import os

import warnings
warnings.filterwarnings('ignore')  
plt.rcParams['font.sans-serif']=['SimHei'] #显示中文  

# 设置Seaborn样式
sns.set(font='Microsoft YaHei')
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\_distributor_init.py:32: UserWarning: loaded more than 1 DLL from .libs:
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)
In [ ]:
# 获取字典保存各个模型的最终结果
result_model = {}

output_file = 'output/TN_NH3_N2O'
input_file = 'data/TN_NH3_N2O'
model_save_file = 'output/TN_NH3_N2O/model'
os.makedirs(output_file, exist_ok=True)
os.makedirs(model_save_file, exist_ok=True)

# target = ['TN loss (%)', 'NH3-N loss (%)', 'N2O-N loss (%)']
target = 'TN loss (%)' # 要预测啥就换个名字
# target = ['EF %', 'LN(EF)', 'LNR(N2O)', 'N2O rate(kg N ha-1 y-1)']
In [ ]:
data_path = f'{input_file}/data_for_{target}.csv'
data_tree_path = f'{input_file}/data_for_tree_{target}.csv'
model_performance_path = f'{output_file}/Model_{target}.html'
model_performance_json = f'{output_file}/result_model_{target}.json'

数据集划分¶

In [ ]:
data_all_ef = pd.read_csv(data_path)
X_all = data_all_ef.iloc[:, :-1]
y_all = data_all_ef.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=2023)
y_train = y_train.reset_index(drop=True)

## 为了正确评估模型性能,将数据划分为训练集和测试集,并在训练集上训练模型,在测试集上验证模型性能。
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)

input_cols = X_all.columns.tolist()
In [ ]:
data_all_tree = pd.read_csv(data_tree_path)
X_allt = data_all_tree.iloc[:, :-1]
y_allt = data_all_tree.iloc[:, -1]

X_traint, X_testt, y_traint, y_testt = train_test_split(X_all, y_all, test_size=0.2, random_state=2023)
y_traint = y_traint.reset_index(drop=True)

## 为了正确评估模型性能,将数据划分为训练集和测试集,并在训练集上训练模型,在测试集上验证模型性能。
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)
In [ ]:
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 450 entries, 466 to 537
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   material_0                   450 non-null    int64  
 1   initial CN(%)                450 non-null    float64
 2   initial moisture content(%)  450 non-null    float64
 3   initial pH                   450 non-null    float64
 4   material_1                   450 non-null    int64  
 5   Excipients                   450 non-null    int64  
 6   initial TN(%)                450 non-null    float64
 7   initial TC(%)                450 non-null    float64
 8   Additive Species             450 non-null    int64  
dtypes: float64(5), int64(4)
memory usage: 35.2 KB
In [ ]:
X_train.describe()
Out[ ]:
material_0 initial CN(%) initial moisture content(%) initial pH material_1 Excipients initial TN(%) initial TC(%) Additive Species
count 450.000000 450.000000 450.000000 450.000000 450.000000 450.000000 450.000000 450.000000 450.000000
mean 2.931111 15.603382 34.574044 4.088483 3.382222 27.482222 0.975466 23.110170 3.066667
std 0.790907 13.498014 32.949480 4.245029 1.749488 13.073746 1.740370 23.320782 1.180766
min 0.000000 -1.000000 -1.000000 -1.000000 0.000000 0.000000 -1.000000 -1.000000 0.000000
25% 3.000000 -1.000000 -1.000000 -1.000000 3.000000 17.000000 -1.000000 -1.000000 3.000000
50% 3.000000 17.980000 56.230000 6.657690 4.000000 34.000000 1.358654 30.325000 3.000000
75% 3.000000 25.000000 65.000000 7.795000 5.000000 39.000000 2.070000 39.800000 4.000000
max 5.000000 53.734940 89.800000 10.700000 5.000000 39.000000 13.400000 196.750000 4.000000
In [ ]:
X_train
Out[ ]:
material_0 initial CN(%) initial moisture content(%) initial pH material_1 Excipients initial TN(%) initial TC(%) Additive Species
466 3 20.0 65.0 -1.00000 3 5 -1.00 -1.00 4
148 2 -1.0 -1.0 -1.00000 5 39 1.40 -1.00 3
260 3 25.0 60.0 7.60000 3 18 7.87 196.75 4
183 3 -1.0 -1.0 6.85618 4 39 -1.00 -1.00 1
450 3 15.0 75.0 -1.00000 4 6 2.40 35.30 4
... ... ... ... ... ... ... ... ... ...
470 3 32.0 49.6 7.84000 3 23 1.15 37.30 4
52 4 -1.0 60.5 -1.00000 5 39 -1.00 -1.00 3
515 3 25.0 -1.0 -1.00000 3 23 -1.00 -1.00 4
454 3 18.0 70.0 -1.00000 4 6 2.10 38.00 4
537 2 30.0 60.6 -1.00000 5 14 1.41 36.80 4

450 rows × 9 columns

In [ ]:
y_train.isnull().any()
Out[ ]:
False
In [ ]:
print(X_train.isnull().any())
material_0                     False
initial CN(%)                  False
initial moisture content(%)    False
initial pH                     False
material_1                     False
Excipients                     False
initial TN(%)                  False
initial TC(%)                  False
Additive Species               False
dtype: bool
In [ ]:
np.isnan(X_train).any()
Out[ ]:
material_0                     False
initial CN(%)                  False
initial moisture content(%)    False
initial pH                     False
material_1                     False
Excipients                     False
initial TN(%)                  False
initial TC(%)                  False
Additive Species               False
dtype: bool

树模型预测(RF,XGB,LGB,CTB)¶

RF¶

In [ ]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=1000, criterion='mse',
                            max_depth=10, min_samples_split=2,
                            min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                            max_features='auto',  max_leaf_nodes=None,
                            bootstrap=True, oob_score=False,
                            n_jobs=1, random_state=None,
                            verbose=0, warm_start=False)

rf_preds = np.zeros(X_train.shape[0])

for train_index, valid_index in kf.split(X_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    print(x_tr.shape, y_tr.shape)
    rf_model.fit(x_tr, y_tr)
    rf_pred_valid = rf_model.predict(x_va)
    rf_preds[valid_index] = rf_pred_valid

    print('The rmse of the RFRegression is:',metrics.mean_squared_error(y_va,rf_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, rf_pred_valid, 'r', label='xgb predict')
    plt.legend(loc='best')
    plt.title('xgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(360, 9) (360,)
The rmse of the RFRegression is: 126.0314666685265
(360, 9) (360,)
The rmse of the RFRegression is: 145.5660239711958
(360, 9) (360,)
The rmse of the RFRegression is: 131.31412750536043
(360, 9) (360,)
The rmse of the RFRegression is: 119.95701259689821
(360, 9) (360,)
The rmse of the RFRegression is: 252.0317008134364
In [ ]:
rf_pred_test = rf_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,rf_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, rf_pred_test, 'r', label='rf predict')
plt.legend(loc='best')
plt.title('rf-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(rf_model, f'{model_save_file}/rf_model.pkl') 
result_model['RandomForest(k)'] = metrics.mean_squared_error(y_test,rf_pred_test)
The rmse of the test is: 152.48951026880778

XGB¶

In [ ]:
import xgboost as xgb

xgb_preds = np.zeros(X_train.shape[0])
kf = KFold(n_splits=num_folds, shuffle=True)
xgb_model = xgb.XGBRegressor(max_depth=8,
                        learning_rate=0.1,
                        n_estimators=300,
                        n_jobs=4,
                        colsample_bytree=0.8,
                        subsample=0.8,
                        random_state=32,
                        tree_method='hist'
                        )

for train_index, valid_index in kf.split(X_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    print(x_tr.shape, y_tr.shape)
    xgb_model.fit(x_tr, y_tr)
    xgb_pred_valid = xgb_model.predict(x_va)
    xgb_preds[valid_index] = xgb_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the XgbRegression is:',metrics.mean_squared_error(y_va,xgb_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, xgb_pred_valid, 'r', label='xgb predict')
    plt.legend(loc='best')
    plt.title('xgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(360, 9) (360,)
The rmse of the XgbRegression is: 150.4376191929306
(360, 9) (360,)
The rmse of the XgbRegression is: 157.80004523008589
(360, 9) (360,)
The rmse of the XgbRegression is: 106.71409876836324
(360, 9) (360,)
The rmse of the XgbRegression is: 164.3025653761061
(360, 9) (360,)
The rmse of the XgbRegression is: 167.06615300896544
In [ ]:
xgb_pred_test = xgb_model.predict(X_test)
print('The rmse of the XgbRegression is:',metrics.mean_squared_error(y_test, xgb_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, xgb_pred_test, 'r', label='xgb predict')
plt.legend(loc='best')
plt.title('xgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(xgb_model, f'{model_save_file}/xgb_model.pkl') 
result_model['XGBoost(k)'] = metrics.mean_squared_error(y_test,xgb_pred_test)
The rmse of the XgbRegression is: 161.20656664940898
In [ ]:
xgb_pred_test
Out[ ]:
array([84.90975  , 40.411938 , 84.90975  , 40.36061  , 30.81254  ,
       58.17785  , 40.813774 , 47.137566 , 36.048138 , 34.232754 ,
       36.048138 , 24.441133 , 17.381983 , 18.252796 ,  4.251944 ,
        6.372212 , 42.83542  , 40.505272 , 17.315933 , 36.048138 ,
       45.864594 , 38.36129  , 19.748665 , 48.605034 , 32.474518 ,
       29.898304 ,  8.696232 , 38.491615 ,  8.650021 , 36.20542  ,
       41.521667 , 26.360083 , 24.259657 , 40.505272 , 18.623236 ,
       35.769142 , 40.18205  , 20.484081 ,  9.788815 , 18.752897 ,
       25.451082 , 33.60631  , 35.769142 , 15.704942 , 24.441133 ,
       42.301727 , 23.094816 , 17.997398 , 31.184462 , 84.41048  ,
       16.69752  , 40.505272 , 24.441133 , 33.246864 , 24.303764 ,
       33.913746 , 37.34529  , 30.070019 , 30.56996  , 28.185186 ,
       12.755292 , 16.744617 , 18.24543  , 13.301644 , 26.466618 ,
       40.145653 , 35.153057 , 36.01605  , 34.90764  , 15.838131 ,
       21.322765 ,  0.3356682, 50.4274   , 28.103344 , 20.186169 ,
       23.172392 , 24.441133 , 47.137566 , 47.76451  , 28.274157 ,
       22.182278 , 28.796017 , 26.188715 , 34.746414 , 22.084042 ,
       25.610353 , 18.504105 , 44.952965 , 46.569164 , 40.813774 ,
       34.746414 , 18.392904 , 40.65509  , 40.07162  , 11.603221 ,
       42.74766  , 23.828777 , 80.21579  , 27.678026 , 33.407772 ,
       19.191181 , 30.66869  , 32.336376 ,  7.084119 , 24.303764 ,
       80.21579  , 28.97032  ,  0.4624677, 19.506504 , 34.489136 ,
       28.275036 ,  8.650021 , 36.209553 ], dtype=float32)

LGB¶

In [ ]:
import lightgbm as lgb

lgb_preds = np.zeros(X_traint.shape[0])
for i, (train_index, valid_index) in enumerate(kf.split(X_traint, y_traint)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_tr, y_tr, X_va, y_va = X_traint.iloc[train_index], \
        y_traint.iloc[train_index], X_traint.iloc[valid_index], y_traint.iloc[valid_index]

    params_lgb = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'learning_rate': 0.1,
                'n_estimators': 300,
                'metric': 'root_mean_squared_error',
                'min_child_weight': 1e-3,
                'min_child_samples': 10,
                'num_leaves': 31,
                'max_depth': -1,
                'seed': 2023,
                'verbose': -1,
    }
    
    lgb_model = lgb.LGBMRegressor(**params_lgb)
    lgb_model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_va, y_va)], eval_metric=['mae','rmse'])
    lgb_val_pred = lgb_model.predict(X_va)
    lgb_preds[valid_index] = lgb_val_pred

    print('The rmse of the LgbRegression is:',metrics.mean_squared_error(y_va,lgb_val_pred))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, lgb_val_pred, 'r', label='lgb predict')
    plt.legend(loc='best')
    plt.title('lgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
************************************ 1 ************************************
The rmse of the LgbRegression is: 144.15979510245958
************************************ 2 ************************************
The rmse of the LgbRegression is: 131.31738146739463
************************************ 3 ************************************
The rmse of the LgbRegression is: 146.76703142514518
************************************ 4 ************************************
The rmse of the LgbRegression is: 107.73408809186085
************************************ 5 ************************************
The rmse of the LgbRegression is: 203.50332298250953
In [ ]:
lgb_pred_test = lgb_model.predict(X_testt)
print('The rmse of the test is:',metrics.mean_squared_error(y_testt,lgb_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40,10))
plt.plot(x[:100], y_testt[:100], 'g', label='ground-turth')
plt.plot(x[:100], lgb_pred_test[:100], 'r', label='lgb predict')
plt.legend(loc='best')
plt.title('lgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(lgb_model, f'{model_save_file}/lgb_model.pkl') 
result_model['Lightgbm(k)'] = metrics.mean_squared_error(y_testt,lgb_pred_test)
The rmse of the test is: 168.0038498088402

CTB¶

In [ ]:
from catboost import CatBoostRegressor

params = {
    'learning_rate': 0.05,
    'loss_function': "RMSE",
    'eval_metric': "RMSE", # CrossEntropy
    'depth': 8,
    'min_data_in_leaf': 20,
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'one_hot_max_size': 5,   #类别数量多于此数将使用ordered target statistics编码方法,默认值为2。
    'boosting_type':"Ordered", #Ordered 或者Plain,数据量较少时建议使用Ordered,训练更慢但能够缓解梯度估计偏差。
    'max_ctr_complexity': 2, #特征组合的最大特征数量,设置为1取消特征组合,设置为2只做两个特征的组合,默认为4。
    'nan_mode': 'Min' 
}
iterations = 400
early_stopping_rounds = 200
ctb_model = CatBoostRegressor(iterations=iterations,
    early_stopping_rounds = early_stopping_rounds,
    **params)

for i, (train_index, valid_index) in enumerate(kf.split(X_traint)):
    print('************************************ {} ************************************'.format(str(i+1)))
    x_tr, x_va = X_traint.iloc[train_index], X_traint.iloc[valid_index]
    y_tr, y_va = y_traint.iloc[train_index], y_traint.iloc[valid_index]
    
    print(x_tr.shape, y_tr.shape)
    ctb_model.fit(x_tr, y_tr, eval_set=(x_va, y_va), verbose=0, early_stopping_rounds=1000)
    ctb_pre= ctb_model.predict(x_va)

    print('The rmse of the CatRegression is:',metrics.mean_squared_error(y_va,ctb_pre))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, ctb_pre, 'r', label='cat predict')
    plt.legend(loc='best')
    plt.title('cat-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
************************************ 1 ************************************
(360, 9) (360,)
The rmse of the CatRegression is: 137.65860243967714
************************************ 2 ************************************
(360, 9) (360,)
The rmse of the CatRegression is: 91.00799259127238
************************************ 3 ************************************
(360, 9) (360,)
The rmse of the CatRegression is: 151.9945800734046
************************************ 4 ************************************
(360, 9) (360,)
The rmse of the CatRegression is: 118.63014436668998
************************************ 5 ************************************
(360, 9) (360,)
The rmse of the CatRegression is: 140.55848059764645
In [ ]:
cat_pred_test = ctb_model.predict(X_testt)
print('The rmse of the test is:',metrics.mean_squared_error(y_testt,cat_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40,10))
plt.plot(x, y_testt, 'g', label='ground-turth')
plt.plot(x, cat_pred_test, 'r', label='cat predict')
plt.legend(loc='best')
plt.title('cat-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(ctb_model, f'{model_save_file}/ctb_model.pkl') 
result_model['Catboost(k)'] = metrics.mean_squared_error(y_test,cat_pred_test)
The rmse of the test is: 134.20532181711567

使用其他机器学习方法预测¶

岭回归¶

In [ ]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

ri_model = Ridge(alpha=0.5,fit_intercept=True)

lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    ri_model.fit(x_tr, y_tr)
    lr_pred_valid = ri_model.predict(x_va)
    lr_preds[valid_index] = lr_pred_valid

    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,lr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, lr_pred_valid, 'r', label='lr predict')
    plt.legend(loc='best')
    plt.title('lr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(360, 9) (360,)
The rmse of the LR is: 219.01021050096995
(360, 9) (360,)
The rmse of the LR is: 426.9549340311016
(360, 9) (360,)
The rmse of the LR is: 297.35225997165344
(360, 9) (360,)
The rmse of the LR is: 294.53694723908524
(360, 9) (360,)
The rmse of the LR is: 365.18053210044656
In [ ]:
ri_pred_test = ri_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,ri_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, ri_pred_test, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(ri_model, f'{model_save_file}/ri_model.pkl') 
result_model['RidgeRegression(k)'] = metrics.mean_squared_error(y_test,ri_pred_test)
The rmse of the test is: 326.04477768286097

线性回归¶

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

lr_model = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)

lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    lr_model.fit(x_tr, y_tr)
    lr_pred_valid = lr_model.predict(x_va)
    lr_preds[valid_index] = lr_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,lr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, lr_pred_valid, 'r', label='lr predict')
    plt.legend(loc='best')
    plt.title('lr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(360, 9) (360,)
The rmse of the LR is: 314.1936119105932
(360, 9) (360,)
The rmse of the LR is: 328.36971054412
(360, 9) (360,)
The rmse of the LR is: 286.59978825058465
(360, 9) (360,)
The rmse of the LR is: 335.67162987133275
(360, 9) (360,)
The rmse of the LR is: 298.7482991436072
In [ ]:
lr_pred_test = lr_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,lr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, lr_pred_test, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(lr_model, f'{model_save_file}/lr_model.pkl')
result_model['LinearRegression(k)'] = metrics.mean_squared_error(y_test,lr_pred_test)
The rmse of the test is: 317.42579423069157

使用MLP进行预测¶

In [ ]:
from sklearn.neural_network import MLPRegressor

mlp_model = MLPRegressor(solver='lbfgs',alpha=1e-5, hidden_layer_sizes=(40,40), max_iter=500, random_state=2023)

mlp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    mlp_model.fit(x_tr, y_tr)
    mlp_pred_valid = mlp_model.predict(x_va)
    mlp_preds[valid_index] = mlp_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the MLP is:',metrics.mean_squared_error(y_va,mlp_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, mlp_pred_valid, 'r', label='mlp predict')
    plt.legend(loc='best')
    plt.title('mlp-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(360, 9) (360,)
The rmse of the MLP is: 172.16619071151197
(360, 9) (360,)
The rmse of the MLP is: 287.3069464332374
(360, 9) (360,)
The rmse of the MLP is: 177.94632641403044
(360, 9) (360,)
The rmse of the MLP is: 219.3194947093819
(360, 9) (360,)
The rmse of the MLP is: 190.39058581948484
In [ ]:
mlp_pred_test = mlp_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,mlp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, mlp_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(mlp_model, f'{model_save_file}/mlp_model.pkl')
result_model['MLP(k)'] = metrics.mean_squared_error(y_test,mlp_pred_test)
The rmse of the test is: 212.5785076065758

SVR进行预测¶

In [ ]:
from sklearn.svm import SVR

svr_model = SVR(kernel ='rbf',
                degree = 3,
                coef0 = 0.0,
                tol = 0.001,
                C = 1.0,
                epsilon = 0.1,
                shrinking = True,
                cache_size = 200,
                verbose = False,
                max_iter = -1)

svr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    svr_model.fit(x_tr, y_tr)
    svr_pred_valid = svr_model.predict(x_va)
    svr_preds[valid_index] = svr_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,svr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, svr_pred_valid, 'r', label='svr predict')
    plt.legend(loc='best')
    plt.title('svr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(360, 9) (360,)
The rmse of the LR is: 336.94506431335725
(360, 9) (360,)
The rmse of the LR is: 275.29736103784944
(360, 9) (360,)
The rmse of the LR is: 281.4711931902122
(360, 9) (360,)
The rmse of the LR is: 349.0051223555379
(360, 9) (360,)
The rmse of the LR is: 387.8624151085627
In [ ]:
svr_pred_test = svr_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,svr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, svr_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(svr_model, f'{model_save_file}/svr_model.pkl')
result_model['SVR(k)'] = metrics.mean_squared_error(y_test,svr_pred_test)
The rmse of the test is: 378.8414160194384

使用GAUSS Rgression进行预测¶

In [ ]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# 创建高斯过程回归模型对象
gp_model = GaussianProcessRegressor(kernel=RBF())

gp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    gp_model.fit(x_tr, y_tr)
    gp_pred_valid = gp_model.predict(x_va)
    gp_preds[valid_index] = gp_pred_valid

    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,gp_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, gp_pred_valid, 'r', label='gp predict')
    plt.legend(loc='best')
    plt.title('gp-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(360, 9) (360,)
The rmse of the LR is: 834.3952646708552
(360, 9) (360,)
The rmse of the LR is: 754.7523866750222
(360, 9) (360,)
The rmse of the LR is: 792.1270485352245
(360, 9) (360,)
The rmse of the LR is: 548.6501813244878
(360, 9) (360,)
The rmse of the LR is: 900.3367960162427
In [ ]:
gp_pred_test = gp_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,gp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, gp_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(gp_model, f'{model_save_file}/gp_model.pkl')
result_model['GR(k)'] = metrics.mean_squared_error(y_test,gp_pred_test)
The rmse of the test is: 880.5795542654563

不同模型的效果比对分析(RMSE指标)¶

In [ ]:
import json

# 将字典保存为 JSON 文件
with open(model_performance_json, 'w') as json_file:
    json.dump(result_model, json_file)
In [ ]:
import plotly.express as px
import plotly.io as pio

with open(model_performance_json, 'r') as json_file:
    result_model = json.load(json_file)
    
result_model = dict(sorted(result_model.items(), key=lambda item: item[1]))
categories = list(result_model.keys())
values = list(result_model.values())

color_mapping = {}
for k, v in result_model.items():
    if v < 0.4:
        color_mapping[k] = "Tree-Base"
    else:
        color_mapping[k] = "Other"
    
# 创建柱状图
fig = px.bar(x=categories, y=values, title='Models Performance in ln(EF)', color=color_mapping)
fig.update_layout(template="seaborn")
# 显示图表
fig.show()
# 保存柱状图为 HTML 文件
pio.write_html(fig, file=model_performance_path)  

挑选合适的模型进行进一步的变量分析(Tree-Base)¶

In [ ]:
import shap
shap.initjs()

变量重要性分析¶

RF模型分析--变量重要性分析¶
In [ ]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))
colors=px.colors.qualitative.Plotly

feat_importance=pd.DataFrame()
feat_importance["Importance"]=rf_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()
Xgboost¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=xgb_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()
Catboost¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()
Lightbgm¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=lgb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()

SHAP是由Shapley value启发的可加性解释模型。对于每个预测样本,模型都产生一个预测值,SHAP value就是该样本中每个特征所分配到的数值。 很明显可以看出,与上一节中feature importance相比,SHAP value最大的优势是SHAP能对于反映出每一个样本中的特征的影响力,而且还表现出影响的正负性。

In [ ]:
# 获取feature importance
plt.figure(figsize=(15, 5))

feat_importance=pd.DataFrame()
feat_importance["Importance"]=lgb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance', ascending=False)

plt.bar(range(len(X_traint.columns)), feat_importance['Importance'])
plt.xticks(range(len(X_traint.columns)), feat_importance.index, rotation=90, fontsize=14)
plt.title('Feature importance', fontsize=14)
plt.show()
In [ ]:
lgb_explainer = shap.TreeExplainer(lgb_model)
lgb_shap_values = lgb_explainer.shap_values(X_traint)
print(lgb_shap_values.shape)
(450, 9)
In [ ]:
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values[0,:], X_traint.iloc[0,:])
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

对第一个实例的特征贡献图也可用 waterfall 方式展示

In [ ]:
shap.plots._waterfall.waterfall_legacy(
    lgb_explainer.expected_value, 
    lgb_shap_values[0], 
    feature_names=X_traint.columns)

上图的解释显示了每个有助于将模型输出从基值(我们传递的训练数据集上的平均模型输出)贡献到模型输出值的特征。将预测推高的特征以红色显示,将预测推低的特征以蓝色显示。

如果我们采取许多实例来聚合显示解释,如下图所示,将它们旋转 90 度,然后将它们水平堆叠,我们可以看到整个数据集的解释(在 Notebook 中,此图是交互式的)

In [ ]:
# visualize the training set predictions
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint)
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

下图中每一行代表一个特征,横坐标为SHAP值。一个点代表一个样本,颜色越红说明特征本身数值越大,颜色越蓝说明特征本身数值越小。

In [ ]:
shap.summary_plot(lgb_shap_values, X_traint)

shap value值排序的特征重要性

In [ ]:
shap.summary_plot(lgb_shap_values, X_traint, plot_type="bar")

SHAP也提供了部分依赖图的功能,与传统的部分依赖图不同的是,这里纵坐标不是目标变量y的数值而是SHAP值。可以观察各个特征的分布与目标shap值的关系。

In [ ]:
fig, axes = plt.subplots(len(input_cols)//3+1, 3, figsize=(20,30))
for i, col in enumerate(input_cols):
    shap.dependence_plot(col, lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[i//3,i%3])
    # shap.dependence_plot('Sand (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[0,2])
    # shap.dependence_plot('Silt (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,0])
    # shap.dependence_plot('Clay (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,1])
    # shap.dependence_plot('SOC (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,2])
    # shap.dependence_plot('TN (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[2,1])
    # shap.dependence_plot('C/N', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[2,2])
    # shap.dependence_plot('pH', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,0])
    # shap.dependence_plot('TN (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,1])
    # shap.dependence_plot('BD', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,2])
    # shap.dependence_plot('CEC', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,0])
    # shap.dependence_plot('N application', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
    # shap.dependence_plot('BNE', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
    # shap.dependence_plot('MAT (°C)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
    # shap.dependence_plot('MAP (mm)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
  • 对多个变量的交互进行分析

我们也可以多个变量的交互作用进行分析。一种方式是采用summary_plot描绘出散点图,如下:

In [ ]:
shap_interaction_values = shap.TreeExplainer(lgb_model).shap_interaction_values(X_traint)
plt.figure(figsize=(12,12))
shap.summary_plot(shap_interaction_values, X_traint, max_display=6)
plt.show()
<Figure size 1200x1200 with 0 Axes>

我们也可以用dependence_plot描绘两个变量交互下变量对目标值的影响。

In [ ]:
shap.dependence_plot(input_cols[0], lgb_shap_values, X_traint, interaction_index=input_cols[1], show=False)

也能可视化每种特征对于整体预测值的影响。

In [ ]:
shap.decision_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint, ignore_warnings=True)
Catboost¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()